import reading_adjusting as ra
import pandas as pd
import numpy as np
import os
import sklearn
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import shap
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, plot
# Enable plotly offline rendering (bundled JS, no CDN connection needed).
init_notebook_mode(connected=False)
# NOTE(review): hard-coded absolute Windows path — consider making this
# configurable (env var / CLI arg) before sharing the script.
root= 'C:/Users/PauloAlves/Documents/Pessoais/MercadoLibre/'
# Item-level TV listings with their attributes; first CSV column is the index.
final_tv_id = pd.read_csv(root + "01. Data/dados_tv_id.csv", index_col = 0)
# Raw TV listings, including seller-level columns (prefixed "seller").
final_tv = pd.read_csv(root + "01. Data/dados_tv.csv", index_col = 0)
final_tv_id.shape  # notebook-style inspection; no effect when run as a script
## Deleting variables with >=80% of missings
# Fraction of missing values per column.
t = final_tv_id.isnull().sum()/final_tv_id.shape[0]
# Keep only columns with strictly less than 80% missing values.
final_t = t[t < 0.8]
# NOTE(review): final_tv_id_nonull (and final_t) are never used afterwards —
# the next step selects an explicit column whitelist from final_tv_id directly,
# so this filtering result is effectively dead code. Kept as-is because wiring
# it in would change which columns survive downstream.
final_tv_id_nonull = final_tv_id[final_t.index]
## Selecting only variables related to the product
# Explicit whitelist: listing-level fields (prefixed "__"), shipping/tag fields,
# and TV attribute columns (UPPERCASE).
names = ['__id','__seller_id','__price','__base_price',
'__initial_quantity','__available_quantity','__sold_quantity','__listing_type_id','__start_time','__condition','__accepts_mercadopago',
'__shipping__mode','__shipping__tags','__shipping__local_pick_up','__shipping__free_shipping','__shipping__logistic_type','__shipping__store_pick_up','__status','__tags',
'__warranty','__catalog_listing','BRAND','MODEL','ACCESSORIES_INCLUDED','ASPECT_RATIO','DEPTH',
'DISPLAY_SIZE','DISPLAY_TYPE','GTIN','HDMI_PORTS_NUMBER','HEIGHT','INTEGRATED_APPS','INTEGRATED_VIRTUAL_ASSISTANTS','IS_SMART',
'ITEM_CONDITION','MAX_RESOLUTION','MAX_SPEAKERS_POWER','OPERATIVE_SYSTEM','RESOLUTION_TYPE','SELLER_SKU','SOUND_MODES','SPEAKERS_NUMBER',
'USB_PORTS_NUMBER','VOLTAGE','WEIGHT','WIDTH','WITH_BLUETOOTH','WITH_ETHERNET','WITH_HDMI','WITH_HDR',
'WITH_INTEGRATED_VOICE_COMMAND','WITH_USB','WITH_WI_FI','pictures','COLOR','IS_3D','IS_CURVED','IS_PORTABLE',
'LINE','WITH_AUTO_POWER_OFF','WITH_SCREEN_SHARE_FUNCTION','CONTRAST_RATIO','PROCESSOR_CORES_NUMBER','WITH_NETFLIX','WITH_WEB_BROWSER','WITH_YOUTUBE']
# `names` is already a list, so the redundant list(...) wrapper was dropped.
final_tv_id = final_tv_id[names]
final_tv_id.shape  # notebook-style inspection
# Converting list-valued variables into multiple columns.
# NOTE(review): ra.list_to_dataframe is a project helper (reading_adjusting);
# presumably it explodes a list column into prefixed indicator columns —
# verify its contract in that module.
# transforming shipping tags into multiple columns
final_tv_id = ra.list_to_dataframe(df = final_tv_id, var_name = '__shipping__tags', prefix = 'shipping')
# transforming tags into multiple columns
final_tv_id = ra.list_to_dataframe(df = final_tv_id, var_name = '__tags', prefix = 'product')
# Creating the seller dataframe from every "seller"-prefixed column of the raw data.
seller_table = final_tv[[col for col in final_tv.columns if col.startswith("seller")]].copy()
# transforming seller tag variables into multiple columns
seller_table = ra.list_to_dataframe(df = seller_table, var_name = 'seller__tags', prefix = 'seller')
# Selecting a subset of the seller columns (reputation metrics + tag indicators).
seller_columns = ['seller__id','seller__seller_reputation__transactions__total','seller__seller_reputation__transactions__canceled','seller__seller_reputation__transactions__ratings__negative',
'seller__seller_reputation__transactions__ratings__positive','seller__seller_reputation__transactions__ratings__neutral','seller__seller_reputation__transactions__completed','seller__seller_reputation__power_seller_status',
'seller__seller_reputation__metrics__claims__rate','seller__seller_reputation__metrics__delayed_handling_time__rate','seller__seller_reputation__metrics__sales__completed','seller__seller_reputation__metrics__cancellations__rate',
'seller__seller_reputation__level_id','seller_normal','seller_user_info_verified','seller_credits_priority_4',
'seller_eshop','seller_mshops','seller_developer','seller_credits_profile',
'seller_messages_as_seller','seller_messages_as_buyer','seller_brand','seller_large_seller',
'seller_medium_seller','seller_credits_priority_3','seller_medium_seller_advanced','seller_credits_priority_2',
'seller_credits_priority_1','seller_from_facebook','seller_credits_active_borrower','seller_credits_open_market']
# One row per seller: drop duplicated seller records before merging.
seller_table = seller_table[seller_columns].drop_duplicates(subset = 'seller__id')
# Merging seller information into the TV dataset.
# NOTE(review): this is an inner join (the default `how`), so listings whose
# __seller_id has no row in seller_table are silently dropped — confirm that
# is intended, otherwise use how='left'.
final_tv_id = final_tv_id.merge(seller_table, left_on = ['__seller_id'], right_on = ['seller__id'] )
final_tv_id.shape  # notebook-style inspection
# Transforming column "ACCESSORIES_INCLUDED" into multiple columns.
# Fill NaN first so the helper sees a uniform string column.
final_tv_id.loc[:, 'ACCESSORIES_INCLUDED'] = final_tv_id.loc[:, 'ACCESSORIES_INCLUDED'].fillna("No Information")
# NOTE(review): ra.str_to_dataframe is a project helper; presumably it splits a
# delimited string column into prefixed indicator columns — verify in the module.
final_tv_id = ra.str_to_dataframe(df = final_tv_id, var_name = 'ACCESSORIES_INCLUDED' , prefix = 'accessories')
# Transforming column "INTEGRATED_APPS" into multiple columns.
final_tv_id.loc[:, 'INTEGRATED_APPS'] = final_tv_id.loc[:, 'INTEGRATED_APPS'].fillna("No Information")
final_tv_id = ra.str_to_dataframe(df = final_tv_id, var_name = 'INTEGRATED_APPS' , prefix = 'apps')
# Drop the standalone app flags — presumably redundant with the apps_* columns
# created just above (TODO confirm).
final_tv_id.drop(['WITH_NETFLIX','WITH_WEB_BROWSER','WITH_YOUTUBE'], axis = 1, inplace = True)
# Promote the listing id to the index (set_index drops the column for us).
final_tv_id = final_tv_id.set_index('__id')
# Fill every remaining object/category column with the "No Information" placeholder.
text_cols = final_tv_id.select_dtypes(include=['object', 'category']).columns
final_tv_id[text_cols] = final_tv_id[text_cols].fillna("No Information")
# Discard columns that do not look useful for modeling (free-text identifiers,
# timestamps, and the now-redundant seller id keys).
final_tv_id = final_tv_id.drop(
    columns=['__start_time', 'MODEL', 'GTIN', 'ITEM_CONDITION',
             'SELLER_SKU', 'LINE', 'seller__id', '__seller_id'])
# 'Percent_Information': share of attribute columns carrying real information
# (i.e. not the "No Information" placeholder). A row-wise mean over the boolean
# frame equals sum(axis=1)/len(columns).
attributes_columns = [ 'ASPECT_RATIO','DEPTH',
'DISPLAY_SIZE','DISPLAY_TYPE','HDMI_PORTS_NUMBER','HEIGHT','INTEGRATED_VIRTUAL_ASSISTANTS','IS_SMART','MAX_RESOLUTION','MAX_SPEAKERS_POWER',
'OPERATIVE_SYSTEM','RESOLUTION_TYPE','SOUND_MODES','SPEAKERS_NUMBER','USB_PORTS_NUMBER','VOLTAGE','WEIGHT','WIDTH',
'WITH_BLUETOOTH','WITH_ETHERNET','WITH_HDMI','WITH_HDR','WITH_INTEGRATED_VOICE_COMMAND','WITH_USB','WITH_WI_FI','pictures',
'COLOR','IS_3D','IS_CURVED','IS_PORTABLE','WITH_AUTO_POWER_OFF','WITH_SCREEN_SHARE_FUNCTION','CONTRAST_RATIO','PROCESSOR_CORES_NUMBER']
final_tv_id['Percent_Information'] = (final_tv_id[attributes_columns] != 'No Information').mean(axis=1)
# 'Percent_Apps': share of the apps_* indicator columns that are set to 1.
app_columns = [c for c in final_tv_id.columns if c.startswith('apps_')]
final_tv_id['Percent_Apps'] = (final_tv_id[app_columns] == 1).mean(axis=1)
# 'Percent_Accessories': same idea for the accessories_* indicator columns.
accessories_columns = [c for c in final_tv_id.columns if c.startswith('accessories_')]
final_tv_id['Percent_Accessories'] = (final_tv_id[accessories_columns] == 1).mean(axis=1)
# Transforming TV physical dimensions from free text to float.
def _length_to_cm(s):
    """Convert a length Series with ' cm' / ' mm' / ' "' (inch) suffixes to float cm.

    'No Information' placeholders become NaN; suffix-less values are assumed to
    already be centimeters. Non-string entries are left for the final astype.
    """
    s = s.str.replace(" cm", "").replace("No Information", np.nan)
    # NaN comparisons are False, so placeholder rows are never touched.
    mask_mm = (s.str.find(" mm") > 0)
    s.loc[mask_mm] = s.loc[mask_mm].str.replace(" mm", "").astype(float) / 10
    mask_inch = (s.str.find(' "') > 0)
    s.loc[mask_inch] = s.loc[mask_inch].str.replace(' "', "").astype(float) * 2.54
    return s.astype(float)

### DEPTH / HEIGHT / WIDTH share the same text format (cm, mm or inches)
final_tv_id['DEPTH'] = _length_to_cm(final_tv_id['DEPTH'])
final_tv_id['HEIGHT'] = _length_to_cm(final_tv_id['HEIGHT'])
final_tv_id['WIDTH'] = _length_to_cm(final_tv_id['WIDTH'])
### WEIGHT from text to float (kg); ' lb' values converted to kilograms
weight = final_tv_id['WEIGHT'].str.replace(" kg", "").replace("No Information", np.nan)
mask_lb = (weight.str.find(" lb") > 0)
weight.loc[mask_lb] = weight.loc[mask_lb].str.replace(" lb", "").astype(float) * 0.453592
final_tv_id['WEIGHT'] = weight.astype(float)
### DISPLAY_SIZE from text to float (inches)
display_size = final_tv_id['DISPLAY_SIZE'].str.replace(' "', "").replace("No Information", np.nan)
display_size = display_size.str.replace(' in', "").str.replace(' pulgadas', "")
final_tv_id['DISPLAY_SIZE'] = display_size.astype(float)
### MAX_SPEAKERS_POWER from text to float (watts)
power = final_tv_id['MAX_SPEAKERS_POWER'].str.replace(" W", "").replace("No Information", np.nan)
final_tv_id['MAX_SPEAKERS_POWER'] = power.astype(float)
final_tv_id[['SPEAKERS_NUMBER','USB_PORTS_NUMBER','PROCESSOR_CORES_NUMBER','HDMI_PORTS_NUMBER']].dtypes  # notebook-style inspection
# Train-Test Split: features are every column except the target "__sold_quantity".
X_train, X_test, y_train, y_test = train_test_split(
    final_tv_id.loc[:, ~final_tv_id.columns.isin(["__sold_quantity"])],
    final_tv_id["__sold_quantity"],
    random_state=123,
)
# One-hot encode all categorical columns, fitting on the train split only.
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4; try the new name first and fall back for older versions.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
str_columns = X_train.select_dtypes(include=['object', 'category']).columns
ohe.fit(X_train[str_columns])
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# get_feature_names_out; support both and compute the names once.
if hasattr(ohe, 'get_feature_names_out'):
    ohe_names = ohe.get_feature_names_out()
else:
    ohe_names = ohe.get_feature_names()
# Replace the original categorical columns with their one-hot encoding in both
# splits (the encoder was fit on train; unknown test categories encode as all-zero).
X_train[ohe_names] = pd.DataFrame(ohe.transform(X_train[str_columns]), index=X_train.index, columns=ohe_names)
X_train.drop(str_columns, axis=1, inplace=True)
X_test[ohe_names] = pd.DataFrame(ohe.transform(X_test[str_columns]), index=X_test.index, columns=ohe_names)
X_test.drop(str_columns, axis=1, inplace=True)
iterations = 1000
# Hyperopt search space for the XGBoost regressor. Each hyperopt label must
# match its dict key so the best-trial dict and space_eval stay consistent.
params = {
    "eta"             : hp.uniform("eta", 0.1, 0.5),
    "gamma"           : hp.uniform("gamma", 0, 0.3),
    # Label fixed: it previously contained a trailing space ("max_depth ").
    "max_depth"       : hp.randint("max_depth", 2, 6),
    "min_child_weight": hp.randint("min_child_weight", 1, int(X_train.shape[0] * 0.1)),
    "subsample"       : hp.uniform("subsample", 0.5, 1),
    "sampling_method" : hp.choice("sampling_method", ['uniform', 'gradient_based']),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.1, 0.9),
    "lambda"          : hp.uniform("lambda", 0.5, 3),
    "alpha"           : hp.uniform("alpha", 0.5, 3),
    "n_estimators"    : hp.randint("n_estimators", 30, 100),
}
def f_nn(params):
    """Hyperopt objective: fit an XGBoost regressor with `params` and return
    the test-set MAE (the quantity hyperopt minimizes).

    Uses the module-level X_train/X_test/y_train/y_test splits; early stopping
    watches the test split.
    """
    model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
    model.set_params(**params)
    model.fit(
        X_train,
        y_train,
        early_stopping_rounds=10,
        eval_metric="mae",
        eval_set=[(X_test, y_test)],
        verbose=False,
    )
    test_mae = mae(y_test, model.predict(X_test))
    return {"loss": test_mae, "status": STATUS_OK}
# Bookkeeping object that records every evaluated trial.
trials = Trials()
# NOTE(review): mid-file import — conventionally this belongs at the top of the file.
from functools import partial
# TPE configuration: the first 30% of evaluations are random start-up trials.
algo = partial(
tpe.suggest,
n_startup_jobs=int(iterations * 0.3),
gamma=0.25,
n_EI_candidates=200,
)
# Run the search, then map the winning trial back to concrete parameter values.
best = fmin(f_nn, params, algo=algo, max_evals=iterations, trials=trials)
best_params = space_eval(params, best)
best_params  # notebook-style inspection
# Backward feature elimination: repeatedly drop the `step` least-important
# features (ranked by feature_importances_) and record the test MAE at each
# feature count.
step = 20
columns = X_train.columns
acc = []  # test-set MAE per iteration
n_poins = {}  # feature count -> columns used at that count ("poins" typo kept: later cells index it)
n = len(columns)
while n > 0:
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
    xgb_model.set_params(**best_params)
    # NOTE(review): early_stopping_rounds/eval_metric as fit() arguments were
    # removed in xgboost 2.0 (moved to the constructor) — confirm the pinned
    # xgboost version supports this call signature.
    xgb_model.fit(X_train[columns], y_train, early_stopping_rounds=10,eval_metric="mae", eval_set=[(X_test[columns], y_test)], verbose = False)
    acc.append(mae(y_test, xgb_model.predict(X_test[columns])))
    n_poins[n] = columns
    # Rank the surviving features by importance, most important first.
    fi = pd.DataFrame(xgb_model.feature_importances_, index = X_train[columns].columns).sort_values(0,ascending = False)
    # NOTE(review): when fewer than `step` columns remain, fi.shape[0]-step is
    # negative, so this slice drops fewer than `step` columns (negative-stop
    # slicing) instead of emptying; the sequence of counts stored in n_poins
    # (including the 16 used downstream) depends on this exact behavior.
    columns = fi.iloc[ 0: (fi.shape[0] - step),:].index
    n = len(columns)
# MAE per feature count (notebook-style display; expression result is discarded as a script).
pd.DataFrame({"Number_columns": list(n_poins.keys()),"MAE":acc})
# Refit on the 16-feature subset found by the elimination loop above.
columns = n_poins[16] #sixteen columns
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_model.set_params(**best_params)
# evals_result() keys: validation_0 = train split, validation_1 = test split.
eval_set = [(X_train[columns], y_train), (X_test[columns], y_test)]
# NOTE(review): fit()-level early_stopping_rounds/eval_metric were removed in
# xgboost 2.0 — confirm the pinned version.
xgb_model.fit(X_train[columns], y_train, early_stopping_rounds=10,eval_metric=["mae",'rmse'], eval_set=eval_set, verbose = False)
results = xgb_model.evals_result()
epochs = len(results['validation_0']['mae'])
x_axis = range(0, epochs)
# Learning curves (train vs. test) for both tracked metrics, one figure each.
curve_specs = [
    ('mae', 'MAE', 'XGBoost - Mean Absolute Error (MAE)'),
    ('rmse', 'RMSE', 'XGBoost - Root Mean Squared Error'),
]
for metric_key, y_label, plot_title in curve_specs:
    fig, ax = plt.subplots()
    ax.plot(x_axis, results['validation_0'][metric_key], label='Train')
    ax.plot(x_axis, results['validation_1'][metric_key], label='Test')
    ax.legend()
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.title(plot_title)
    plt.show()
# Retrain with fewer epochs (35) to reduce the train/test gap observed in the
# learning curves above.
best_params['n_estimators'] = 35
columns = n_poins[16]
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_model.set_params(**best_params)
eval_set = [(X_train[columns], y_train), (X_test[columns], y_test)]
xgb_model.fit(X_train[columns], y_train, early_stopping_rounds=10,eval_metric=["mae",'rmse'], eval_set=eval_set, verbose = False)
# Residuals (prediction - actual) on both splits, consumed by the plotly
# histograms below. (Previous comment said "Root Mean Squared Error" — copy-paste.)
columns = n_poins[16]
predict_trt = xgb_model.predict(X_train[columns])
predict_tst = xgb_model.predict(X_test[columns])
error_tst = predict_tst - y_test
error_trt = predict_trt - y_train
# Overlaid residual histograms (percent-normalized) for the train and test splits.
fig = go.Figure()
for residuals, trace_name in ((error_trt, "Train"), (error_tst, "Test")):
    fig.add_trace(go.Histogram(
        x=residuals,
        histnorm='percent',
        name=trace_name,
        opacity=0.5,
        # Fixed binning so both traces share the same bins.
        xbins=dict(start=-500.0, end=50.0, size=5),
    ))
fig.update_layout(yaxis=dict(title='Porcentaje'),
                  xaxis=dict(title="Error"),
                  title='Distribuicion de los erros')
fig.update_layout(barmode='overlay')
fig.show()
# SHAP explanations for the final 16-feature model, computed on the train split.
explainer = shap.Explainer(xgb_model)
shap_values = explainer(X_train[columns])
# Global feature importance (mean |SHAP value| per feature).
shap.plots.bar(shap_values)
# Visualize the first prediction's explanation.
shap.plots.waterfall(shap_values[0])
# Summarize the effects of all the features across the whole split.
shap.plots.beeswarm(shap_values)